In [1]:
import numpy as np
import pandas as pd

Create preparation_info document


In [2]:
# Read the tab-separated demultiplex sheet into a DataFrame.
info = pd.read_csv('../data/Demultiplex_Sheet.txt', sep='\t')

In [3]:
# Order the sheet by sample ID so repeated entries for a sample sit next to each other.
info = info.sort_values(by='#SampleID')
info.head()


Out[3]:
#SampleID BarcodeSequence LinkerPrimerSequence ReversePrimer Description
60 Orwoll.BI0023.BI TCTGGTGACATT GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0023.BI
1129 Orwoll.BI0023.BI TCTGGTGACATT GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0023.BI
649 Orwoll.BI0056.BI CAAGCATGCCTA GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0056.BI
475 Orwoll.BI0056.BI CAAGCATGCCTA GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0056.BI
75 Orwoll.BI0131.BI CTATTTGCGACA GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0131.BI

In [4]:
# Keep one row per (#SampleID, ReversePrimer) pair; the first occurrence wins.
info = info.drop_duplicates(subset=['#SampleID', 'ReversePrimer'], keep='first')
# save the deduplicated sheet (tab-separated, without the index column)
info.to_csv('unduplicated_demultiplex.txt', index=False, sep='\t')

In [5]:
# Preview the sheet after deduplication.
info.head()


Out[5]:
#SampleID BarcodeSequence LinkerPrimerSequence ReversePrimer Description
60 Orwoll.BI0023.BI TCTGGTGACATT GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0023.BI
649 Orwoll.BI0056.BI CAAGCATGCCTA GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0056.BI
75 Orwoll.BI0131.BI CTATTTGCGACA GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0131.BI
932 Orwoll.BI0153.BI ATCGGCGTTACA GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0153.BI
833 Orwoll.BI0215.BI CCTCTCGTGATC GGACTACHVGGGTWTCTAAT GTGCCAGCMGCCGCGGTAA Orwoll.BI0215.BI

In [6]:
# Number of rows remaining after deduplication.
info.shape[0]


Out[6]:
600

In [7]:
# Assemble the preparation table: per-sample fields come from the deduplicated
# demultiplex sheet, study-wide fields are a single value repeated for every row.
n = info.shape[0]
sample_ids = info['#SampleID']

# Per-sample fields. `.values` drops the pandas index so rows line up by position.
data = {
    # e.g. 'Orwoll.BI0023.BI'[7:13] -> 'BI0023'
    'Sample_Name': sample_ids.str.slice(7, 13).values,
    'Barcode': info['BarcodeSequence'].values,
    'LinkerPrimerSequence': info['LinkerPrimerSequence'].values,
    'Description': info['Description'].values,
    # e.g. 'Orwoll.BI0023.BI'[14:16] -> 'BI'
    'Center_Name': sample_ids.str.slice(14, 16).values,
}

# Study-wide fields, identical for every sample.
constant_fields = {
    'Experiment_Design_Description': '16S stool samples sequenced for MrOS Vitamin D study',
    'Library_Construction_Protocol': '16S rRNA v4',
    'Linker': 'GT',
    'Platform': 'Illumina',
    'Center_Project': 'MrOS',
    'Instrument_Model': 'Illumina MiSeq',
}
for field, value in constant_fields.items():
    data[field] = np.repeat(value, n)

# Explicit column order of the output table.
prep_columns = [
    'Sample_Name', 'Barcode', 'LinkerPrimerSequence',
    'Description',
    'Experiment_Design_Description',
    'Library_Construction_Protocol', 'Linker', 'Platform',
    'Center_Name', 'Center_Project', 'Instrument_Model',
]
prep_info = pd.DataFrame(data, columns=prep_columns)

In [8]:
# (rows, columns) of the preparation table.
prep_info.shape


Out[8]:
(600, 11)

In [9]:
# Inspect the first 10 rows of the preparation table.
prep_info.head(10)


Out[9]:
Sample_Name Barcode LinkerPrimerSequence Description Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model
0 BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT Orwoll.BI0023.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
1 BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT Orwoll.BI0056.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
2 BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT Orwoll.BI0131.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
3 BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT Orwoll.BI0153.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
4 BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT Orwoll.BI0215.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
5 BI0353 TGCCATCTGAAT GGACTACHVGGGTWTCTAAT Orwoll.BI0353.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
6 BI0371 GAATAGAGCCAA GGACTACHVGGGTWTCTAAT Orwoll.BI0371.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
7 BI0372 ATGCCAACCAAC GGACTACHVGGGTWTCTAAT Orwoll.BI0372.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
8 BI0380 GAGTCGCGTTTA GGACTACHVGGGTWTCTAAT Orwoll.BI0380.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
9 BI0389 GCTATGGACCGA GGACTACHVGGGTWTCTAAT Orwoll.BI0389.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq

In [10]:
# Fix a mismatched sample name: the ID typed as 'BIO778' (letter "O") is
# rewritten to 'BI0778' (digit zero), the pattern every other ID follows
# (Nora's email on 08/07/2017).
# The replacement is limited to the Sample_Name column so no other field can be
# altered by accident; the result is a new frame, so re-running the cell is harmless.
prep_info = prep_info.assign(
    Sample_Name=prep_info['Sample_Name'].replace('BIO778', 'BI0778'))

Create sample_info document


In [2]:
# Load the two comma-separated metadata files that together describe the participants.
samples_part1 = pd.read_csv('../data/VitDMetadata_update.csv', sep=',')
samples_part2 = pd.read_csv('../data/Other32metadata.csv', sep=',')

In [3]:
# Report (rows, columns) of each metadata file.
for part in (samples_part1, samples_part2):
    print(part.shape)


(567, 26)
(32, 26)

In [4]:
# List the column names of each metadata file so they can be compared by eye.
for part in (samples_part1, samples_part2):
    print(part.columns)


Index(['ID', 'GIERACE', 'SITE', 'TUDRAMT', 'HWWGT', 'V4AGE1', 'HWHGT',
       'PASCORE', 'HWBMI', 'TURSMOKE', 'DTVITD', 'M1ADEPR', 'M1VITMND',
       'M1ANTIB', 'M1PROBI', 'OHV1D3', 'OHV24D3', 'OHVD3', 'OHVD2', 'OHV1D2',
       'OHV1D2CT', 'OHVD2CT', 'OHVDTOT', 'OHV1DTOT', 'OHSEAS', 'VDstatus'],
      dtype='object')
Index(['ID', 'GIERACE', 'SITE', 'TUDRAMT', 'HWWGT', 'V4AGE1', 'HWHGT',
       'PASCORE', 'HWBMI', 'TURSMOKE', 'DTVITD', 'M1ADEPR', 'M1VITMND',
       'M1ANTIB', 'M1PROBI', 'OHV1D3', 'OHV24D3', 'OHVD3', 'OHVD2', 'OHV1D2',
       'OHV1D2CT', 'OHVD2CT', 'OHVDTOT', 'OHV1DTOT', 'OHSEAS', 'VDstatus'],
      dtype='object')

In [5]:
# Stack the two metadata files row-wise. ignore_index=True gives the result a
# fresh 0..n-1 index, which the later column-wise concat with the `required`
# table relies on for row alignment.
samples = pd.concat([samples_part1, samples_part2], ignore_index=True)

In [6]:
# (rows, columns) of the combined metadata; rows should equal the two parts added together.
samples.shape


Out[6]:
(599, 26)

In [7]:
# Tally how many participants fall into each M1ANTIB category.
samples.M1ANTIB.value_counts()


Out[7]:
0: No     558
1: Yes     41
Name: M1ANTIB, dtype: int64

In [16]:
# Count distinct IDs; a value equal to the row count means no ID is repeated.
len(pd.unique(samples.ID))


Out[16]:
599

In [17]:
# Inspect the first 10 rows of the combined metadata.
samples.head(10)


Out[17]:
ID GIERACE SITE TUDRAMT HWWGT V4AGE1 HWHGT PASCORE HWBMI TURSMOKE ... OHV24D3 OHVD3 OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus
0 BI0023 1:WHITE Birmingham 1: Less than one drink per week 83.1 83 169.60 91.000000 28.890119 M:Not Applicable ... 1.77 25.8 0 0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency
1 BI0056 1:WHITE Birmingham 0:None drinker 76.2 81 163.40 199.178571 28.539796 1:PAST ... 3.91 39.2 0 0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency
2 BI0131 1:WHITE Birmingham 0:None drinker 78.5 83 177.15 161.714286 25.014242 1:PAST ... 1.49 23.1 0 0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency
3 BI0153 1:WHITE Birmingham 4: 6-13 drinks per week 95.1 79 175.50 88.214286 30.876373 1:PAST ... 2.14 27.3 0 0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency
4 BI0215 1:WHITE Birmingham 3: 3-5 drinks per week 93.0 81 166.40 256.821429 33.587394 1:PAST ... 3.62 33 0 0 1: Yes 1: Yes 33 50.2 4:FALL sufficiency
5 BI0353 1:WHITE Birmingham 0:None drinker 83.6 80 177.90 179.571429 26.415229 1:PAST ... 1.79 19.5 0 0 1: Yes 1: Yes 19.5 45.5 2:SPRING deficiency
6 BI0371 2:AFRICAN AMERICAN Birmingham 3: 3-5 drinks per week 102.6 79 190.10 144.571429 28.391159 0:NO ... 1.82 22.3 0 0 1: Yes 1: Yes 22.3 60 2:SPRING sufficiency
7 BI0372 1:WHITE Birmingham 1: Less than one drink per week 85.3 80 175.90 91.678571 27.568768 0:NO ... 3.37 37.2 0.7 0 1: Yes 0: No 37.9 53.6 2:SPRING sufficiency
8 BI0380 1:WHITE Birmingham 0:None drinker 110.6 85 192.35 65.607143 29.893086 1:PAST ... 2.55 45.7 0 0 1: Yes 1: Yes 45.7 50.2 2:SPRING sufficiency
9 BI0389 1:WHITE Birmingham 0:None drinker 72.2 85 177.70 209.714286 22.864533 0:NO ... 1.93 26.7 0 0 1: Yes 1: Yes 26.7 49.7 2:SPRING sufficiency

10 rows × 26 columns


In [18]:
# Recruitment site of every participant, as a Series.
sites = samples['SITE']

In [19]:
# reference: http://www.latlong.net/
# (latitude, longitude) for each SITE value, kept as strings so they are written
# out exactly as typed here.
SITE_COORDINATES = {
    'Birmingham': ('33.520661', '-86.80249'),
    'San Diego': ('32.715738', '-117.1611'),
    'Pittsburgh': ('40.440625', '-79.99589'),
    'Palo Alto': ('37.441883', '-122.143'),
    'Portland': ('45.523062', '-122.6765'),
    'Minneapolis': ('44.977753', '-93.26501'),
}
sites = samples.loc[:, 'SITE']

# Fail loudly on an unrecognised (or missing) site. The previous if/elif chain
# had no fallback branch, so such a row silently inherited the coordinates of
# the row before it (or raised NameError if it was the very first row).
unknown_sites = sorted({str(site) for site in sites if site not in SITE_COORDINATES})
if unknown_sites:
    raise ValueError(
        'No coordinates defined for SITE value(s): ' + ', '.join(unknown_sites))

# One entry per row of `samples`, in row order. Iterating over the values (not
# sites[i]) keeps this correct whatever index `samples` carries.
Latitude = [SITE_COORDINATES[site][0] for site in sites]
Longitude = [SITE_COORDINATES[site][1] for site in sites]

In [20]:
# simple check: print the site and the derived coordinates for a handful of rows
check_rows = [1, 90, 200, 300, 400, 500]
print(samples['SITE'][check_rows])
for coordinates in (Latitude, Longitude):
    print(np.array(coordinates)[check_rows])


1       Birmingham
90     Minneapolis
200      Palo Alto
300     Pittsburgh
400       Portland
500      San Diego
Name: SITE, dtype: object
['33.520661' '44.977753' '37.441883' '40.440625' '45.523062' '32.715738']
['-86.80249' '-93.26501' '-122.143' '-79.99589' '-122.6765' '-117.1611']

In [21]:
# Build the required sample-information fields, one array per field and one
# entry per row of `samples`.
m = len(samples)


def _column(name):
    """Return one metadata column as a plain array (index dropped)."""
    return samples[name].values


def _constant(value):
    """Return `value` repeated once per sample."""
    return np.repeat(value, m)


required = {
    'Sample_Name': _column('ID'),
    'Title': _constant('MrOS_VitaminD'),
    'Anonymized_Name': _column('ID'),
    'Scientific_Name': _constant('human gut metagenome'),
    'Taxon_ID': _constant('Not applicable'),
    # 'Description' is not set here; prep_info already has a column of that
    # name, and the two tables are merged later on Sample_Name.
    'Sample_Type': _constant('stool'),
    'Geo_Loc_Name': _column('SITE'),
    'Elevation': _constant('Not applicable'),
    'Env_Biome': _constant('urban biome'),
    'Env_Feature': _constant('human-associated habitat'),
    'Env_Material': _constant('feces'),
    'Env_Package': _constant('human-gut'),
    # Per-row coordinates derived from SITE in an earlier cell.
    'Latitude': Latitude,
    'Longitude': Longitude,
    'Collection_Timestamp': _constant('Not applicable'),
    'DNA_Extracted': _constant('Not applicable'),
    'Physical_Specimen_Location': _constant('Not applicable'),
    'Physical_Specimen_Remaining': _constant('Not applicable'),
    'Age': _column('V4AGE1'),
    'Age_Units': _constant('years'),
    'Host_Subject_ID': _constant('Not applicable'),
    'Host_Taxid': _constant('Not applicable'),
    'Host_Scientific_Name': _constant('Homo sapiens'),
    'Host_Common_Name': _constant('human'),
    'Life_Stage': _constant('adult'),
    'Sex': _constant('male'),
    'Height': _column('HWHGT'),
    'Height_Units': _constant('cm'),
    'Weight': _column('HWWGT'),
    'Weight_Units': _constant('kg'),
    'BMI': _column('HWBMI'),
    'Body_Habitat': _constant('UBERON:feces'),
    'Body_Site': _constant('UBERON:feces'),
    'Body_Product': _constant('UBERON:feces'),
}

In [22]:
# Put the required fields next to the original metadata, then keep only the
# columns listed below, in this order. ID, V4AGE1, HWHGT, HWWGT and HWBMI are
# left out because the required fields already carry them under new names.
required_columns = [
    'Sample_Name', 'Title', 'Anonymized_Name',
    'Scientific_Name', 'Taxon_ID',
    'Sample_Type', 'Geo_Loc_Name', 'Elevation',
    'Env_Biome', 'Env_Feature', 'Env_Material', 'Env_Package', 'Latitude', 'Longitude',
    'Collection_Timestamp', 'DNA_Extracted', 'Physical_Specimen_Location',
    'Physical_Specimen_Remaining',
    'Age', 'Age_Units', 'Host_Subject_ID', 'Host_Taxid', 'Host_Scientific_Name',
    'Host_Common_Name',
    'Life_Stage', 'Sex', 'Height', 'Height_Units', 'Weight', 'Weight_Units', 'BMI',
    'Body_Habitat', 'Body_Site', 'Body_Product',
]
metadata_columns = [
    'GIERACE', 'SITE', 'TUDRAMT', 'PASCORE', 'TURSMOKE', 'DTVITD',
    'M1ADEPR', 'M1VITMND', 'M1ANTIB', 'M1PROBI', 'OHV1D3', 'OHV24D3',
    'OHVD3', 'OHVD2', 'OHV1D2', 'OHV1D2CT', 'OHVD2CT', 'OHVDTOT', 'OHV1DTOT',
    'OHSEAS', 'VDstatus',
]

# axis=1 places the two tables side by side, aligning rows on the index.
combined = pd.concat([pd.DataFrame(required), samples], axis=1)
sample_info = pd.DataFrame(combined, columns=required_columns + metadata_columns)

In [23]:
# (rows, columns) of the sample-information table.
sample_info.shape


Out[23]:
(599, 55)

In [24]:
# Inspect the first 10 rows of the sample-information table.
sample_info.head(10)


Out[24]:
Sample_Name Title Anonymized_Name Scientific_Name Taxon_ID Sample_Type Geo_Loc_Name Elevation Env_Biome Env_Feature ... OHV24D3 OHVD3 OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus
0 BI0023 MrOS_VitaminD BI0023 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 1.77 25.8 0 0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency
1 BI0056 MrOS_VitaminD BI0056 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 3.91 39.2 0 0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency
2 BI0131 MrOS_VitaminD BI0131 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 1.49 23.1 0 0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency
3 BI0153 MrOS_VitaminD BI0153 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 2.14 27.3 0 0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency
4 BI0215 MrOS_VitaminD BI0215 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 3.62 33 0 0 1: Yes 1: Yes 33 50.2 4:FALL sufficiency
5 BI0353 MrOS_VitaminD BI0353 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 1.79 19.5 0 0 1: Yes 1: Yes 19.5 45.5 2:SPRING deficiency
6 BI0371 MrOS_VitaminD BI0371 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 1.82 22.3 0 0 1: Yes 1: Yes 22.3 60 2:SPRING sufficiency
7 BI0372 MrOS_VitaminD BI0372 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 3.37 37.2 0.7 0 1: Yes 0: No 37.9 53.6 2:SPRING sufficiency
8 BI0380 MrOS_VitaminD BI0380 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 2.55 45.7 0 0 1: Yes 1: Yes 45.7 50.2 2:SPRING sufficiency
9 BI0389 MrOS_VitaminD BI0389 human gut metagenome Not applicable stool Birmingham Not applicable urban biome human-associated habitat ... 1.93 26.7 0 0 1: Yes 1: Yes 26.7 49.7 2:SPRING sufficiency

10 rows × 55 columns

Match Sample_Name between the two documents


In [25]:
# Sample names of each table as plain Python lists, used for the cross-checks below.
id_prep = prep_info['Sample_Name'].tolist()
id_sample = sample_info['Sample_Name'].tolist()

In [26]:
# samples with metadata yet no sequencing data
for sample_id in id_sample:
    if sample_id in id_prep:
        continue
    print(sample_id)

In [27]:
# samples with sequencing data yet no metadata
no_sample_info = [sample_id for sample_id in id_prep if sample_id not in id_sample]

In [28]:
# How many sequenced samples lack metadata, followed by their names.
print(len(no_sample_info), no_sample_info, sep='\n')


1
['PO7100']

Drop unneeded samples


In [29]:
# exclude PO7100, as it has no microbiome data (see Lily Liu's email on 08/07/2017)
# The rows removed are those whose Sample_Name appears in no_sample_info.
has_sample_info = ~prep_info['Sample_Name'].isin(no_sample_info)
prep_info = prep_info.loc[has_sample_info]

In [30]:
# (rows, columns) of the preparation table after dropping unmatched samples.
prep_info.shape


Out[30]:
(599, 11)

Write preparation and sample information files


In [31]:
# Write the sample table as tab-separated text without the index column;
# missing values are written as 'Missing:not collected'.
sample_info.to_csv('../data/sample_MrOS.txt', sep= '\t', na_rep='Missing:not collected', index=False)
# Write the preparation table in the same format. No na_rep is given here, so
# any missing value would be written as an empty field (the pandas default).
prep_info.to_csv('../data/prep_MrOS.txt', sep= '\t', index=False)

Create QIIME mapping file


In [32]:
# Preview the preparation table before merging it with the sample information.
prep_info.head(5)


Out[32]:
Sample_Name Barcode LinkerPrimerSequence Description Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model
0 BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT Orwoll.BI0023.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
1 BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT Orwoll.BI0056.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
2 BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT Orwoll.BI0131.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
3 BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT Orwoll.BI0153.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq
4 BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT Orwoll.BI0215.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq

In [33]:
# Join preparation and sample information on Sample_Name; the default inner join
# keeps only names present in both tables.
mapping = prep_info.merge(sample_info, on='Sample_Name')

In [34]:
# Compare the shapes of both inputs with the merged result.
for table in (prep_info, sample_info, mapping):
    print(table.shape)


(599, 11)
(599, 55)
(599, 65)

In [35]:
# Rename the ID and barcode columns to '#SampleID' and 'BarcodeSequence'.
mapping = mapping.rename(columns={
    'Sample_Name': '#SampleID',
    'Barcode': 'BarcodeSequence',
})

In [36]:
# (rows, columns) of the mapping table after renaming; renaming must not change the shape.
mapping.shape


Out[36]:
(599, 65)

In [37]:
# Preview the mapping table with its renamed leading columns.
mapping.head(5)


Out[37]:
#SampleID BarcodeSequence LinkerPrimerSequence Description Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project ... OHV24D3 OHVD3 OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus
0 BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT Orwoll.BI0023.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS ... 1.77 25.8 0 0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency
1 BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT Orwoll.BI0056.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS ... 3.91 39.2 0 0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency
2 BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT Orwoll.BI0131.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS ... 1.49 23.1 0 0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency
3 BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT Orwoll.BI0153.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS ... 2.14 27.3 0 0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency
4 BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT Orwoll.BI0215.BI 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS ... 3.62 33 0 0 1: Yes 1: Yes 33 50.2 4:FALL sufficiency

5 rows × 65 columns


In [38]:
# move 'Description' to the end
cols = mapping.columns.tolist()
# Take 'Description' out of its current position and re-add it as the last column.
cols.append(cols.pop(cols.index('Description')))
mapping = mapping[cols]

In [39]:
# Confirm that 'Description' is now the last column.
mapping.head(5)


Out[39]:
#SampleID BarcodeSequence LinkerPrimerSequence Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model ... OHVD3 OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus Description
0 BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq ... 25.8 0 0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency Orwoll.BI0023.BI
1 BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq ... 39.2 0 0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency Orwoll.BI0056.BI
2 BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq ... 23.1 0 0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency Orwoll.BI0131.BI
3 BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq ... 27.3 0 0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency Orwoll.BI0153.BI
4 BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq ... 33 0 0 1: Yes 1: Yes 33 50.2 4:FALL sufficiency Orwoll.BI0215.BI

5 rows × 65 columns


In [40]:
# Write the merged mapping table as tab-separated text, without the index column.
mapping.to_csv('../data/mapping_MrOS.txt', sep= '\t', index=False)

In [ ]: